#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook
%matplotlib inline
import seaborn as sns
#turning off warnings for final notebook
import warnings
warnings.filterwarnings('ignore')
# show every column when displaying DataFrames (this dataset has 28 columns)
pd.set_option('display.max_columns', None)
# global seaborn styling for all plots below
sns.set_context('notebook')
sns.set_style('whitegrid')
sns.set_palette('Blues_r')
#importing the file
df = pd.read_csv('marketing_data.csv')
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | Graduation | Divorced | $84,835.00 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
| 1 | 1 | 1961 | Graduation | Single | $57,091.00 | 0 | 0 | 6/15/14 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
| 2 | 10476 | 1958 | Graduation | Married | $67,267.00 | 0 | 1 | 5/13/14 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 3 | 1386 | 1967 | Graduation | Together | $32,474.00 | 1 | 1 | 5/11/14 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 4 | 5371 | 1989 | Graduation | Single | $21,474.00 | 1 | 0 | 4/8/14 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null object 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Response 2240 non-null int64 26 Complain 2240 non-null int64 27 Country 2240 non-null object dtypes: int64(23), object(5) memory usage: 490.1+ KB
df.columns
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', ' Income ',
'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines',
'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Response', 'Complain', 'Country'],
dtype='object')
df.isnull().sum()
ID 0 Year_Birth 0 Education 0 Marital_Status 0 Income 24 Kidhome 0 Teenhome 0 Dt_Customer 0 Recency 0 MntWines 0 MntFruits 0 MntMeatProducts 0 MntFishProducts 0 MntSweetProducts 0 MntGoldProds 0 NumDealsPurchases 0 NumWebPurchases 0 NumCatalogPurchases 0 NumStorePurchases 0 NumWebVisitsMonth 0 AcceptedCmp3 0 AcceptedCmp4 0 AcceptedCmp5 0 AcceptedCmp1 0 AcceptedCmp2 0 Response 0 Complain 0 Country 0 dtype: int64
#lets work with the null values of Income
#We need to plot a feature to identify the best strategy for imputation
plt.figure(figsize=(8,4))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot is the
# replacement). Also, per df.info() above, 'Income' is still an object/string
# column at this point — presumably this cell was re-run after the string
# cleaning further down. TODO confirm cell execution order.
sns.distplot(df['Income'], kde=False, hist=True)
plt.title('Income distribution', size=16)
plt.ylabel('count');
df['Income'].plot(kind='box', figsize=(3,4), patch_artist=True)
<AxesSubplot:>
#Finding from the above graph:
#most income is distributed between $0-$100,000 with a few outliers
# Impute missing Income values with the column median (robust to the outliers above).
# FIX: the original line was `df['Income'].fillna(df['Income']).median()` —
# that fills nothing (a series filled with itself) and then assigns the scalar
# median to the WHOLE column, wiping out every customer's real income
# (visible later in the notebook where all Income values are 51381.5).
df['Income'] = df['Income'].fillna(df['Income'].median())
#checking null values in case
df['Income'].isnull().sum()
0
#cleaning the column names: strip the stray padding around ' Income '
# FIX: use str.strip() rather than str.replace(' ', '') — strip removes only
# leading/trailing whitespace, so any meaningful interior spaces in future
# column names would be preserved (replace would delete those too).
df.columns = df.columns.str.strip()
df.columns
#we are good to go.
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
'AcceptedCmp2', 'Response', 'Complain', 'Country'],
dtype='object')
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | Graduation | Divorced | $84,835.00 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
| 1 | 1 | 1961 | Graduation | Single | $57,091.00 | 0 | 0 | 6/15/14 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
| 2 | 10476 | 1958 | Graduation | Married | $67,267.00 | 0 | 1 | 5/13/14 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 3 | 1386 | 1967 | Graduation | Together | $32,474.00 | 1 | 1 | 5/11/14 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 4 | 5371 | 1989 | Graduation | Single | $21,474.00 | 1 | 0 | 4/8/14 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
Now, we need to change the format of the 'Income' column to Float and do a reformatting to remove '$' and ',' from the data
# strip whitespace, '$' and ',' from Income, then convert to float.
# FIX: pass regex=False explicitly — with the historical pandas default
# (regex=True), the pattern '$' is a zero-width end-of-string anchor and
# removes nothing, so astype(float) would fail on values like '$84,835.00'.
df['Income'] = (df['Income']
                .str.replace(' ', '', regex=False)
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
                .astype('float'))
#new dataset
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | Graduation | Divorced | 84835.0 | 0 | 0 | 6/16/14 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
| 1 | 1 | 1961 | Graduation | Single | 57091.0 | 0 | 0 | 6/15/14 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA |
| 2 | 10476 | 1958 | Graduation | Married | 67267.0 | 0 | 1 | 5/13/14 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US |
| 3 | 1386 | 1967 | Graduation | Together | 32474.0 | 1 | 1 | 5/11/14 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS |
| 4 | 5371 | 1989 | Graduation | Single | 21474.0 | 1 | 0 | 4/8/14 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP |
#to check for the changes in the data type
print(df['Income'].dtypes)
float64
# Identify features containing outliers.
# The ID and the 0/1 campaign-response flags are excluded — boxplots are
# meaningless for binary indicators — leaving only numeric measurements.
binary_and_id_cols = ['ID', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                      'AcceptedCmp4', 'AcceptedCmp5', 'Response', 'Complain']
df_to_plot = df.drop(columns=binary_and_id_cols).select_dtypes(include=np.number)
# draw one small boxplot per feature on a 4x4 grid
df_to_plot.plot(subplots=True, layout=(4,4), kind='box', figsize=(12,14), patch_artist=True)
plt.subplots_adjust(wspace=0.5);
Multiple features contain outliers (see boxplots below), but the only ones that likely indicate data-entry errors are Year_Birth <= 1900. So we have to remove those rows.
# Drop rows whose birth year indicates a data-entry error (<= 1900),
# then renumber the index so it stays contiguous for later concat/merge steps.
plausible_birth_year = df['Year_Birth'] > 1900
df = df.loc[plausible_birth_year].reset_index(drop=True)
#checking for the same:
plt.figure(figsize=(3,4))
df['Year_Birth'].plot(kind='box', patch_artist=True)
<AxesSubplot:>
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2237 entries, 0 to 2236 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2237 non-null int64 1 Year_Birth 2237 non-null int64 2 Education 2237 non-null object 3 Marital_Status 2237 non-null object 4 Income 2237 non-null float64 5 Kidhome 2237 non-null int64 6 Teenhome 2237 non-null int64 7 Dt_Customer 2237 non-null object 8 Recency 2237 non-null int64 9 MntWines 2237 non-null int64 10 MntFruits 2237 non-null int64 11 MntMeatProducts 2237 non-null int64 12 MntFishProducts 2237 non-null int64 13 MntSweetProducts 2237 non-null int64 14 MntGoldProds 2237 non-null int64 15 NumDealsPurchases 2237 non-null int64 16 NumWebPurchases 2237 non-null int64 17 NumCatalogPurchases 2237 non-null int64 18 NumStorePurchases 2237 non-null int64 19 NumWebVisitsMonth 2237 non-null int64 20 AcceptedCmp3 2237 non-null int64 21 AcceptedCmp4 2237 non-null int64 22 AcceptedCmp5 2237 non-null int64 23 AcceptedCmp1 2237 non-null int64 24 AcceptedCmp2 2237 non-null int64 25 Response 2237 non-null int64 26 Complain 2237 non-null int64 27 Country 2237 non-null object dtypes: float64(1), int64(23), object(4) memory usage: 489.5+ KB
We found that the column 'Dt_Customer' should be converted to Datetime format
# Convert Dt_Customer from 'M/D/YY' strings (e.g. '6/16/14') to datetime.
# FIX: pass an explicit format so month/day parsing is unambiguous —
# the bare pd.to_datetime call guesses and can silently swap day and month.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%m/%d/%y')
#to check:
print(df['Dt_Customer'].dtypes)
datetime64[ns]
list(df.columns)
['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Response', 'Complain', 'Country']
# ---- Feature engineering ----
# Dependents: kids + teens living at home
df['Dependents'] = df['Kidhome'] + df['Teenhome']
# Year the customer enrolled (Dt_Customer was converted to datetime above)
df['Year_Customer'] = df['Dt_Customer'].dt.year
# Total amount spent across all product categories (the Mnt* columns)
mnt_cols = df.columns[df.columns.str.contains('Mnt')].tolist()
df['TotalMnt'] = df[mnt_cols].sum(axis=1)
# Total purchases across all channels (the *Purchases columns)
purchases_cols = df.columns[df.columns.str.contains('Purchases')].tolist()
df['TotalPurchases'] = df[purchases_cols].sum(axis=1)
# Total campaigns accepted; 'Response' is the latest campaign
campaigns_cols = df.columns[df.columns.str.contains('Cmp')].tolist() + ['Response']
df['TotalCampaignsAcc'] = df[campaigns_cols].sum(axis=1)
# view new features, by customer ID
df[['ID', 'Dependents', 'Year_Customer', 'TotalMnt', 'TotalPurchases', 'TotalCampaignsAcc']].head()
| ID | Dependents | Year_Customer | TotalMnt | TotalPurchases | TotalCampaignsAcc | |
|---|---|---|---|---|---|---|
| 0 | 1826 | 0 | 2014 | 4760 | 30 | 1 |
| 1 | 1 | 0 | 2014 | 2308 | 36 | 2 |
| 2 | 10476 | 1 | 2014 | 1004 | 22 | 0 |
| 3 | 1386 | 2 | 2014 | 44 | 8 | 0 |
| 4 | 5371 | 1 | 2014 | 364 | 16 | 2 |
To identify patterns, we will first identify feature correlations. positive correlations between features appear red, negative correlations appear blue, and no correlation appears grey in the clustered heatmap below.
# calculate correlation matrix
## using non-parametric test of correlation (kendall), since some features are binary
corrs = df.drop(columns='ID').select_dtypes(include=np.number).corr(method = 'kendall')
# plot heatmap of correlations
# FIX: plot the Kendall matrix computed above — the original passed df.corr(),
# which silently recomputed a Pearson matrix (on all columns, ID included)
# and threw away the `corrs` result entirely.
sns.heatmap(corrs, cmap='coolwarm', center=0)
<AxesSubplot:>
#Plot illustrating negative effect of having dependents (kids & teens) on spending:
plt.figure(figsize=(4,4))
# TotalMnt = engineered total spend; one box per number of dependents
sns.boxplot(x='Dependents', y='TotalMnt', data=df);
#Plot illustrating positive effect of having dependents (kids & teens) on number of deals purchased:
plt.figure(figsize=(4,4))
sns.boxplot(x='Dependents', y='NumDealsPurchases', data=df);
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | Dependents | TotalMnt | Year_Customer | TotalPurchases | TotalCampaignsAcc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | Graduation | Divorced | 51381.5 | 0 | 0 | 2014-06-16 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 0 | 4760 | 2014 | 30 | 1 |
| 1 | 1 | 1961 | Graduation | Single | 51381.5 | 0 | 0 | 2014-06-15 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA | 0 | 2308 | 2014 | 36 | 2 |
| 2 | 10476 | 1958 | Graduation | Married | 51381.5 | 0 | 1 | 2014-05-13 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US | 1 | 1004 | 2014 | 22 | 0 |
| 3 | 1386 | 1967 | Graduation | Together | 51381.5 | 1 | 1 | 2014-05-11 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS | 2 | 44 | 2014 | 8 | 0 |
| 4 | 5371 | 1989 | Graduation | Single | 51381.5 | 1 | 0 | 2014-04-08 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 1 | 364 | 2014 | 16 | 2 |
#Using boxplot to plot correlation between TotalCampaignsAcc and Dependents
plt.figure(figsize=(5.5,4))
# x = number of campaigns accepted (0..6), y = number of dependents at home
sns.boxplot(x='TotalCampaignsAcc', y='Dependents', data=df);
Comparing two lmplots:
One between Number of Web Visits per month and Number of Web Purchases
Second between Number of Web Visits per month and Number of Discount Purchases
# lmplot draws the scatter plus a fitted linear-regression line for each pair
sns.lmplot(x='NumWebVisitsMonth', y='NumWebPurchases', data=df)
sns.lmplot(x='NumWebVisitsMonth', y='NumDealsPurchases', data=df)
Number of web visits in the last month is not positively correlated with number of web purchases Instead, it is positively correlated with the number of deals purchased(as the line is positive), suggesting that deals are an effective way of stimulating purchases on the website
df.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | Dependents | TotalMnt | Year_Customer | TotalPurchases | TotalCampaignsAcc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1826 | 1970 | Graduation | Divorced | 51381.5 | 0 | 0 | 2014-06-16 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 0 | 4760 | 2014 | 30 | 1 |
| 1 | 1 | 1961 | Graduation | Single | 51381.5 | 0 | 0 | 2014-06-15 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA | 0 | 2308 | 2014 | 36 | 2 |
| 2 | 10476 | 1958 | Graduation | Married | 51381.5 | 0 | 1 | 2014-05-13 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US | 1 | 1004 | 2014 | 22 | 0 |
| 3 | 1386 | 1967 | Graduation | Together | 51381.5 | 1 | 1 | 2014-05-11 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS | 2 | 44 | 2014 | 8 | 0 |
| 4 | 5371 | 1989 | Graduation | Single | 51381.5 | 1 | 0 | 2014-04-08 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 1 | 364 | 2014 | 16 | 2 |
We need to present our statistical analysis by using Linear Regression Model on the column 'NumStorePurchases' as our target variable and then use Machine Learning techniques to get insights about which features predict the number of store purchases.
#Plotting the target variable:
plt.figure(figsize=(8,3))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use histplot)
sns.distplot(df['NumStorePurchases'], kde=False , hist=True, bins=12)
plt.title('NumStorePurchases Distribution', size=16)
plt.ylabel('Counts');
Dropping uninformative Features:
ID is unique to each customer
Dt_Customer is dropped because we will use the Year_Customer engineered variable.
We will perform one-hot encoding technique of categorical features
#dropping uninformative columns
# ID is unique per customer; Dt_Customer is replaced by the engineered Year_Customer
df.drop(columns=['ID', 'Dt_Customer'], inplace=True)
#one-hot encoding
from sklearn.preprocessing import OneHotEncoder
#get categorical features and review number of unique values
cat = df.select_dtypes(exclude = np.number)
print('Number of unique values per categorical feature: \n', cat.nunique())
Number of unique values per categorical feature: Education 5 Marital_Status 8 Country 8 dtype: int64
# NOTE(review): this targets an older scikit-learn — `sparse=` was renamed
# `sparse_output=` in 1.2, and `get_feature_names` was removed in favour of
# `get_feature_names_out`. Leave as-is for the original environment.
enc = OneHotEncoder(sparse =False).fit(cat)
cat_encoded = pd.DataFrame(enc.transform(cat))
cat_encoded.columns = enc.get_feature_names(cat.columns)
#merge with numeric data
num=df.drop(columns=cat.columns)
# concat relies on both frames sharing the same 0..n-1 RangeIndex
# (df was reset_index after the Year_Birth filter; cat_encoded is freshly built)
df2 = pd.concat([cat_encoded, num], axis=1)
df2.head()
| Education_2n Cycle | Education_Basic | Education_Graduation | Education_Master | Education_PhD | Marital_Status_Absurd | Marital_Status_Alone | Marital_Status_Divorced | Marital_Status_Married | Marital_Status_Single | Marital_Status_Together | Marital_Status_Widow | Marital_Status_YOLO | Country_AUS | Country_CA | Country_GER | Country_IND | Country_ME | Country_SA | Country_SP | Country_US | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Dependents | TotalMnt | Year_Customer | TotalPurchases | TotalCampaignsAcc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1970 | 51381.5 | 0 | 0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4760 | 2014 | 30 | 1 |
| 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1961 | 51381.5 | 0 | 0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 2308 | 2014 | 36 | 2 |
| 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1958 | 51381.5 | 0 | 1 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1004 | 2014 | 22 | 0 |
| 3 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1967 | 51381.5 | 1 | 1 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 44 | 2014 | 8 | 0 |
| 4 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1989 | 51381.5 | 1 | 0 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 364 | 2014 | 16 | 2 |
From the above data, we can see that our One-Hot encoding technique is working fine.
We will use Linear Regression model to our dataset. 70% of data will go into training dataset and 30% of data will go into testing data set.
We will use RMSE (root mean squared error) to evaluate the model on the testing data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
#performing train_test_split and isolating X and y variables
X= df2.drop(columns='NumStorePurchases')
y= df2['NumStorePurchases']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
#implementing LinearRegression model
model =LinearRegression()
# FIX: fit on the TRAINING split — the original called model.fit(X_test, y_test),
# so the "test" RMSE below was measured on the very data the model was fit to.
model.fit(X_train, y_train)
#predictions on the held-out test split
predictions = model.predict(X_test)
#evaluate the model using RMSE
# FIX: the original line `print('...:'), np.sqrt(...)` built a throwaway tuple
# and never printed the number; print the label and value together instead.
# NOTE(review): X still contains TotalPurchases, which sums NumStorePurchases
# into itself — that leakage alone yields a near-zero RMSE. Consider dropping
# the engineered totals from X for an honest evaluation.
print('Linear Regression using RSME:', np.sqrt(mean_squared_error(y_test, predictions)))
Linear Regression using RSME:
(None, 2.052254475267509e-13)
#median value of target variable — gives a sense of scale for judging the RMSE above
print('The median of target variable:', y.median())
The median of target variable: 5.0
Here, as you can see, the RMSE is extremely small compared to the median value of the target, indicating good model predictions.
Identifying significant features that affect the number of store purchases, using permutation importance:
# permutation importance: shuffle each feature column and measure the score drop
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
# top 5 features by mean importance +/- spread across shuffles
eli5.show_weights(perm, feature_names = X_test.columns.tolist(), top=5)
| Weight | Feature |
|---|---|
| 11.5613 ± 0.3298 | TotalPurchases |
| 1.8977 ± 0.1119 | NumCatalogPurchases |
| 1.4006 ± 0.1682 | NumWebPurchases |
| 0.7977 ± 0.0609 | NumDealsPurchases |
| 0 ± 0.0000 | Country_CA |
| … 43 more … | |
Significant Features: 'TotalPurchases', 'NumCatalogPurchases', 'NumWebPurchases', 'NumDealsPurchases'
Explore the directionality of these effects, using SHAP values:
import shap
# calculate shap values
# for a fitted LinearRegression, Explainer with background data yields exact linear SHAP values
ex = shap.Explainer(model, X_train)
shap_values = ex(X_test)
# plot
plt.title('SHAP summary for NumStorePurchases', size=16)
shap.plots.beeswarm(shap_values, max_display=5);
From the above shap plot, we can see that:
#Plotting total number of purchases by country:
plt.figure(figsize=(5,4))
# sum of TotalPurchases per country, largest first
df.groupby('Country')['TotalPurchases'].sum().sort_values(ascending=False).plot(kind='bar')
plt.title('Total Number of Purchases by Country', size=16)
plt.ylabel('Number of Purchases')
Inference from the above barchart:
#Plotting total amount spent by country:
plt.figure(figsize=(5,4))
# sum of TotalMnt per country, largest first
df.groupby('Country')['TotalMnt'].sum().sort_values(ascending=False).plot(kind='bar')
plt.title('Total Amount Spent by Country', size=16)
plt.ylabel('Amount Spent')
Text(0, 0.5, 'Amount Spent')
Inference from the above barchart:
We can assume a case where people who spent an above average amount on gold in the last 2 years would have more in store purchases. We will check by using lmplot from seaborn of two columns MntGoldProds and NumStorePurchases.
sns.lmplot(x='MntGoldProds', y='NumStorePurchases', data= df)
<seaborn.axisgrid.FacetGrid at 0x19a490734c0>
There is a positive relationship but we have to find out whether it is statistically significant.
MntGoldProds contains outliers so we need to perform Kendall correlation analysis(non-parametric test)
# Kendall's tau is rank-based, so the MntGoldProds outliers cannot distort it
from scipy.stats import kendalltau
tau, p_value = kendalltau(x=df['MntGoldProds'], y=df['NumStorePurchases'])
# report the statistic and its significance
print('Kendall correlation (tau): ', tau)
print('Kendall p-value: ', p_value)
Kendall correlation (tau): 0.3927160395725131 Kendall p-value: 3.55881817905435e-152
Yes, there is a significant positive correlation between MntGoldProds and NumStorePurchases
Fish has Omega 3 fatty acids which are good for the brain. Accordingly, do "Married PhD candidates" have a significant relation with amount spent on fish?
We will compare 'MntFishProducts' between Married PHD candidates and all other candidates.
# sum the marital status and phd dummy variables - the Married+PhD group will have value of 2
df2['Married_PhD'] = df2['Marital_Status_Married'] + df2['Education_PhD']
# collapse to two labels: exactly-both -> 'Married-PhD', everyone else -> 'Other'
df2['Married_PhD'] = df2['Married_PhD'].replace({2:'Married-PhD', 1:'Other', 0:'Other'})
# plot MntFishProducts between Married-PhD and others
plt.figure(figsize=(2.5,4))
sns.boxplot(x='Married_PhD', y='MntFishProducts', data=df2, palette="Set2");
Married PhD candidates spent less amount on Fishproducts as compared to other candidates.
# independent t-test p-value
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t); with
# unequal group sizes, Welch's test (equal_var=False) would be the safer choice
from scipy.stats import ttest_ind
pval = ttest_ind(df2[df2['Married_PhD'] == 'Married-PhD']['MntFishProducts'], df2[df2['Married_PhD'] == 'Other']['MntFishProducts']).pvalue
print("t-test p-value: ", round(pval, 3))
t-test p-value: 0.005
# now drop the married-phD column created above, to include only the original variables in the analysis below
# (keeps df2 restricted to the one-hot encoded + numeric features)
df2.drop(columns='Married_PhD', inplace=True)
Like the NumStorePurchases LinearRegression model performed above, we will create another LinearRegression model with MntFishProducts as our target variable, and then use Machine Learning techniques to get insights about which features predict the amount spent on fish.
# distribution of the new target variable (sns.distplot is deprecated in seaborn >= 0.11)
plt.figure(figsize=(10,6))
sns.distplot(df['MntFishProducts'], kde=False, hist=True, bins=12)
plt.title('MntFishProducts distribution', size=16)
plt.ylabel('count')
Text(0, 0.5, 'count')
#we will create the Linearregression model now.
X= df2.drop(columns='MntFishProducts')
y= df2['MntFishProducts']
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=1)
model = LinearRegression()
# (correctly fit on the training split here, unlike the store-purchases model above)
model.fit(X_train, y_train)
LinearRegression()
predict = model.predict(X_test)
# NOTE(review): X includes TotalMnt, which sums MntFishProducts into itself —
# that leakage explains the near-zero RMSE reported below.
print('Linear Regression Model, RMSE:', np.sqrt(mean_squared_error(y_test,predict)))
print('\n')
print('The median value of target variable:', y.median())
Linear Regression Model, RMSE: 8.5839828444711e-13 The median value of target variable: 12.0
As the RMSE is much smaller than the median of the target variable, our model's predictions are doing extremely well.
Identify features that significantly affect the amount spent on fish, using permutation importance
# permutation importance for the fish-spend model, top 7 features
perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist(), top=7)
| Weight | Feature |
|---|---|
| 227.8780 ± 18.8462 | TotalMnt |
| 70.7519 ± 3.9459 | MntWines |
| 33.8252 ± 2.0694 | MntMeatProducts |
| 1.8676 ± 0.0755 | MntGoldProds |
| 1.1637 ± 0.0601 | MntSweetProducts |
| 1.0509 ± 0.0710 | MntFruits |
| 0 ± 0.0000 | Country_IND |
| … 41 more … | |
Significant Features: 'TotalMnt', 'MntWines', 'MntMeatProducts', 'MntGoldProds','MntSweetProducts','MntFruits'
#We will follow the same technique of Shap values as we have used above:
import shap
# calculate shap values (exact for a linear model with background data)
ex = shap.Explainer(model, X_train)
shap_values = ex(X_test)
# plot
plt.title('SHAP summary for MntFishProducts', size=16)
shap.plots.beeswarm(shap_values, max_display=7);
Findings:
So, the customer who spent more on fish are likely to spent less on other products like Wines, Meat, Gold, Fruits, Sweet products
# convert country codes to ISO-3 nomenclature for the choropleth plot
# the dataset doesn't document its country codes, so this is a best guess
# about the largest nations that make sense given the codes provided
# FIX: also map 'GER' -> 'DEU' — 'GER' is not a valid ISO-3 code, so Germany
# silently disappeared from the original choropleth ('AUS' and 'IND' are valid).
df['Country_code'] = df['Country'].replace({'SP': 'ESP', 'CA': 'CAN', 'US': 'USA',
                                            'SA': 'ZAF', 'ME': 'MEX', 'GER': 'DEU'})
# success of campaigns by country code
# melt() unpivots wide -> long: one row per (customer, campaign) with the 0/1 acceptance
df_cam = df[['Country_code', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']].melt(
    id_vars='Country_code', var_name='Campaign', value_name='Accepted (%)')
df_cam.head()
| Country_code | Campaign | Accepted (%) | |
|---|---|---|---|
| 0 | ESP | AcceptedCmp1 | 0 |
| 1 | CAN | AcceptedCmp1 | 0 |
| 2 | USA | AcceptedCmp1 | 0 |
| 3 | AUS | AcceptedCmp1 | 0 |
| 4 | ESP | AcceptedCmp1 | 0 |
# per-country, per-campaign acceptance rate (%): mean of the 0/1 flags * 100
df_cam = pd.DataFrame(df_cam.groupby(['Country_code', 'Campaign'])['Accepted (%)'].mean()*100).reset_index(drop=False)
# rename the campaign variables so they're easier to interpret
# ('Response' is the most recent campaign)
campaign_labels = {'AcceptedCmp1': '1',
                   'AcceptedCmp2': '2',
                   'AcceptedCmp3': '3',
                   'AcceptedCmp4': '4',
                   'AcceptedCmp5': '5',
                   'Response': 'Most recent'}
df_cam['Campaign'] = df_cam['Campaign'].replace(campaign_labels)
df_cam['Campaign'].head()
0 1 1 2 2 3 3 4 4 5 Name: Campaign, dtype: object
# choropleth plot: one faceted map per campaign, coloured by acceptance rate
import plotly.express as px
fig = px.choropleth(
    df_cam,
    locations='Country_code',
    locationmode='ISO-3',
    color='Accepted (%)',
    facet_col='Campaign',
    facet_col_wrap=2,
    facet_row_spacing=0.05,
    facet_col_spacing=0.01,
    width=700,
    projection='natural earth',
    title='Advertising Campaign Success Rate by Country',
)
fig.show()
We need to find out whether the effect of region on campaign success is statistically significant.
# calculate logistic regression p-values for campaign acceptance ~ country using generalized linear model
import statsmodels.formula.api as smf
# NOTE(review): `import statsmodels as sm` only exposes sm.genmod because the
# smf import above happens to load that submodule; `import statsmodels.api as sm`
# is the conventional, reliable spelling.
import statsmodels as sm
from scipy import stats
## get the data of interest for Generalised Linear Model
df_cam_wide = df[['Country', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']]
df_cam_wide.head()
| Country | AcceptedCmp1 | AcceptedCmp2 | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | Response | |
|---|---|---|---|---|---|---|---|
| 0 | SP | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | CA | 0 | 1 | 0 | 0 | 0 | 1 |
| 2 | US | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | AUS | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | SP | 0 | 0 | 1 | 0 | 0 | 1 |
## to store statistics results
stat_results = []
## perform glm: one logistic regression (campaign accepted ~ Country) per campaign
for col in df_cam_wide.drop(columns='Country').columns:
    this_data = df_cam_wide[['Country', col]]
    # define formula, e.g. 'AcceptedCmp1~Country'
    formula = col+'~Country'
    model = smf.glm(formula =formula, data =this_data, family=sm.genmod.families.Binomial())
    result = model.fit()
    # get chisquare value for overall model (CampaignAccepted ~ Country) and calculate p-value
    # FIX: use the fitted model's own df_model instead of the hard-coded 7, so
    # the degrees of freedom stay correct if the number of countries changes.
    # NOTE(review): comparing pearson_chi2 (a goodness-of-fit statistic with
    # roughly n-p degrees of freedom) against df_model is not a standard test —
    # the p-values come out ~0 by construction; a likelihood-ratio test against
    # the intercept-only model would be the orthodox choice.
    chisq = result.pearson_chi2
    pval = stats.distributions.chi2.sf(chisq, result.df_model)
    # append to stat_results
    stat_results.append(pval)
    # print stat summary for entire model
    print(result.summary())
## check results
print("\nChisq p-values: ", stat_results)
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Response No. Observations: 2237
Model: GLM Df Residuals: 2229
Model Family: Binomial Df Model: 7
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -937.28
Date: Sat, 20 Mar 2021 Deviance: 1874.6
Time: 23:19:09 Pearson chi2: 2.24e+03
No. Iterations: 5
Covariance Type: nonrobust
==================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept -1.7845 0.225 -7.919 0.000 -2.226 -1.343
Country[T.CA] -0.0160 0.285 -0.056 0.955 -0.575 0.543
Country[T.GER] -0.0170 0.345 -0.049 0.961 -0.694 0.660
Country[T.IND] -0.5484 0.368 -1.492 0.136 -1.269 0.172
Country[T.ME] 2.4776 1.245 1.990 0.047 0.037 4.918
Country[T.SA] 0.0868 0.271 0.320 0.749 -0.445 0.618
Country[T.SP] 0.1328 0.240 0.553 0.580 -0.337 0.603
Country[T.US] -0.2149 0.372 -0.578 0.563 -0.943 0.513
==================================================================================
Chisq p-values: [0.0, 0.0, 0.0]
Findings: The regional differences in advertising campaign success are statistically significant.
# plotting
## merge the original two-letter country codes back in, for readable axis labels
countries = df[['Country', 'Country_code']].drop_duplicates().reset_index(drop=True)
df_cam2 = df_cam.merge(countries, how='left', on='Country_code')
df_cam2.head()
## bar graphs: one panel per campaign, each annotated with its chi-square p-value
g = sns.FacetGrid(df_cam2, col='Campaign', col_wrap=3)
g.map(sns.barplot, 'Country', 'Accepted (%)')
for ax, pval in zip(g.axes.flat, stat_results):
    ax.text(0, 65, "Chisq p-value: "+str(pval), fontsize=9)
We will now plot the marketing campaign with overall acceptance rates:
df.head()
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Response | Complain | Country | Dependents | TotalMnt | Year_Customer | TotalPurchases | TotalCampaignsAcc | Country_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 | Graduation | Divorced | 51381.5 | 0 | 0 | 0 | 189 | 104 | 379 | 111 | 189 | 218 | 1 | 4 | 4 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 0 | 4760 | 2014 | 30 | 1 | ESP |
| 1 | 1961 | Graduation | Single | 51381.5 | 0 | 0 | 0 | 464 | 5 | 64 | 7 | 0 | 37 | 1 | 7 | 3 | 7 | 5 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | CA | 0 | 2308 | 2014 | 36 | 2 | CAN |
| 2 | 1958 | Graduation | Married | 51381.5 | 0 | 1 | 0 | 134 | 11 | 59 | 15 | 2 | 30 | 1 | 3 | 2 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | US | 1 | 1004 | 2014 | 22 | 0 | USA |
| 3 | 1967 | Graduation | Together | 51381.5 | 1 | 1 | 0 | 10 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | AUS | 2 | 44 | 2014 | 8 | 0 | AUS |
| 4 | 1989 | Graduation | Single | 51381.5 | 1 | 0 | 0 | 6 | 16 | 24 | 11 | 0 | 34 | 2 | 3 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | SP | 1 | 364 | 2014 | 16 | 2 | ESP |
# calculate each campaign's overall success rate: percent of customers who accepted
campaign_flags = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
                  'AcceptedCmp4', 'AcceptedCmp5', 'Response']
cam_success = pd.DataFrame(df[campaign_flags].mean() * 100, columns=['Percent']).reset_index()
cam_success.head()
| index | Percent | |
|---|---|---|
| 0 | AcceptedCmp1 | 6.437193 |
| 1 | AcceptedCmp2 | 1.341082 |
| 2 | AcceptedCmp3 | 7.286544 |
| 3 | AcceptedCmp4 | 7.465355 |
| 4 | AcceptedCmp5 | 7.241842 |
We have to plot 'index' (the campaign name) against 'Percent' (its acceptance rate)
# plot campaigns ordered by success rate ('index' holds the campaign column names)
sns.barplot(x='Percent', y='index', data=cam_success.sort_values('Percent'), palette='Blues')
plt.xlabel('Accepted (%)')
plt.ylabel('Campaign')
plt.title('Marketing campaign success rate', size=16)
Text(0.5, 1.0, 'Marketing campaign success rate')
We conclude that the most successful campaign is the most recent one. Column: 'Response'
# Profile the "average customer". Start by collecting the binary (0/1)
# indicator columns: campaign acceptances, latest-campaign response, complaints.
binary_cols = list(df.columns[df.columns.str.contains('Accepted')])
binary_cols = binary_cols + ['Response', 'Complain']
binary_cols
['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Response', 'Complain']
# Spending columns: amount spent per product category (includes TotalMnt).
mnt_cols = [c for c in df.columns if 'Mnt' in c]
mnt_cols
['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'TotalMnt']
# Purchase-channel columns (deals / web / catalog / store counts),
# plus the derived totals.
channel_cols = [c for c in df.columns if 'Num' in c]
channel_cols += ['TotalPurchases', 'TotalCampaignsAcc']
channel_cols
['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'TotalPurchases', 'TotalCampaignsAcc']
# Average customer demographics: drop the binary, spending and channel
# columns, average what remains, and present the rows in a fixed order.
demo_order = ['Year_Birth', 'Year_Customer', 'Income', 'Dependents',
              'Kidhome', 'Teenhome', 'Recency']
non_demo = binary_cols + mnt_cols + channel_cols
demographics = (df.drop(columns=non_demo)
                  .mean()
                  .round(1)
                  .to_frame('Average')
                  .reindex(demo_order))
demographics
| Average | |
|---|---|
| Year_Birth | 1968.9 |
| Year_Customer | 2013.0 |
| Income | 51381.5 |
| Dependents | 1.0 |
| Kidhome | 0.4 |
| Teenhome | 0.5 |
| Recency | 49.1 |
We conclude that the average customer was born around 1969, became a customer around 2013, has an income of roughly $51,400, and has about one dependent at home.
# Which products are performing best?
# Mean spend per product category, ordered from lowest to highest.
spending = (df[mnt_cols].mean()
            .round(1)
            .to_frame('Average')
            .sort_values('Average')
            .reset_index())
spending.head()
| index | Average | |
|---|---|---|
| 0 | MntFruits | 26.3 |
| 1 | MntSweetProducts | 27.1 |
| 2 | MntFishProducts | 37.5 |
| 3 | MntGoldProds | 44.0 |
| 4 | MntMeatProducts | 166.9 |
# Bar chart of average spend per product category.
ax = sns.barplot(x='Average', y='index', data=spending, palette='Blues')
plt.ylabel('Amount spent on...')

# Add a value label next to each bar. (The original cell contained this
# loop twice, drawing every label on top of itself — the duplicate has
# been removed.) Labels are placed 40 units past the bar end and
# vertically centred on the bar via its actual height.
for patch, value in zip(ax.patches, spending['Average']):
    ax.text(x=value + 40,
            y=patch.get_y() + patch.get_height() / 2,
            s=value,
            ha='center', va='center')
The average customer spent the most on wines, followed by meat products; fruits and sweets were the lowest-spend categories.
Let's find out which channels are underperforming:
# Average activity per purchase channel, lowest first — highlights
# which channels are underperforming.
channels = (df[channel_cols].mean()
            .round(1)
            .to_frame('Average')
            .sort_values('Average')
            .reset_index())

# plot
ax = sns.barplot(x='Average', y='index', data=channels, palette='Blues')
plt.ylabel('Number of...')
# annotate each bar with its value, just past the bar end
for bar, val in zip(ax.patches, channels['Average']):
    ax.text(x=val + 0.8,
            y=bar.get_y() + 0.5,
            s=val,
            ha="center")
Underperforming channels:
Thank you!
I will also attach a PDF summarizing the whole marketing-analytics process.